1 package org.apache.lucene.search.suggest.analyzing;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.util.ArrayList;
23 import java.util.Arrays;
24 import java.util.Collections;
25 import java.util.Comparator;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Set;
29 import java.util.TreeSet;
30
31 import org.apache.lucene.analysis.Analyzer;
32 import org.apache.lucene.analysis.CannedTokenStream;
33 import org.apache.lucene.analysis.MockAnalyzer;
34 import org.apache.lucene.analysis.MockTokenFilter;
35 import org.apache.lucene.analysis.MockTokenizer;
36 import org.apache.lucene.analysis.Token;
37 import org.apache.lucene.analysis.TokenFilter;
38 import org.apache.lucene.analysis.TokenStream;
39 import org.apache.lucene.analysis.TokenStreamToAutomaton;
40 import org.apache.lucene.analysis.Tokenizer;
41 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
42 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
43 import org.apache.lucene.search.suggest.Input;
44 import org.apache.lucene.search.suggest.InputArrayIterator;
45 import org.apache.lucene.search.suggest.Lookup.LookupResult;
46 import org.apache.lucene.util.BytesRef;
47 import org.apache.lucene.util.BytesRefBuilder;
48 import org.apache.lucene.util.IntsRef;
49 import org.apache.lucene.util.LuceneTestCase;
50 import org.apache.lucene.util.TestUtil;
51 import org.apache.lucene.util.automaton.Automaton;
52 import org.apache.lucene.util.automaton.FiniteStringsIterator;
53 import org.apache.lucene.util.fst.Util;
54
55 public class FuzzySuggesterTest extends LuceneTestCase {
56
57 public void testRandomEdits() throws IOException {
58 List<Input> keys = new ArrayList<>();
59 int numTerms = atLeast(100);
60 for (int i = 0; i < numTerms; i++) {
61 keys.add(new Input("boo" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
62 }
63 keys.add(new Input("foo bar boo far", 12));
64 MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
65 FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
66 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
67 suggester.build(new InputArrayIterator(keys));
68 int numIters = atLeast(10);
69 for (int i = 0; i < numIters; i++) {
70 String addRandomEdit = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);
71 List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
72 assertEquals(addRandomEdit, 1, results.size());
73 assertEquals("foo bar boo far", results.get(0).key.toString());
74 assertEquals(12, results.get(0).value, 0.01F);
75 }
76 analyzer.close();
77 }
78
79 public void testNonLatinRandomEdits() throws IOException {
80 List<Input> keys = new ArrayList<>();
81 int numTerms = atLeast(100);
82 for (int i = 0; i < numTerms; i++) {
83 keys.add(new Input("буу" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
84 }
85 keys.add(new Input("фуу бар буу фар", 12));
86 MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
87 FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
88 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, true);
89 suggester.build(new InputArrayIterator(keys));
90 int numIters = atLeast(10);
91 for (int i = 0; i < numIters; i++) {
92 String addRandomEdit = addRandomEdit("фуу бар буу", 0);
93 List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
94 assertEquals(addRandomEdit, 1, results.size());
95 assertEquals("фуу бар буу фар", results.get(0).key.toString());
96 assertEquals(12, results.get(0).value, 0.01F);
97 }
98 analyzer.close();
99 }
100
101
102 public void testKeyword() throws Exception {
103 Input keys[] = new Input[] {
104 new Input("foo", 50),
105 new Input("bar", 10),
106 new Input("barbar", 12),
107 new Input("barbara", 6)
108 };
109
110 Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
111 FuzzySuggester suggester = new FuzzySuggester(analyzer);
112 suggester.build(new InputArrayIterator(keys));
113
114 List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("bariar", random()), false, 2);
115 assertEquals(2, results.size());
116 assertEquals("barbar", results.get(0).key.toString());
117 assertEquals(12, results.get(0).value, 0.01F);
118
119 results = suggester.lookup(TestUtil.stringToCharSequence("barbr", random()), false, 2);
120 assertEquals(2, results.size());
121 assertEquals("barbar", results.get(0).key.toString());
122 assertEquals(12, results.get(0).value, 0.01F);
123
124 results = suggester.lookup(TestUtil.stringToCharSequence("barbara", random()), false, 2);
125 assertEquals(2, results.size());
126 assertEquals("barbara", results.get(0).key.toString());
127 assertEquals(6, results.get(0).value, 0.01F);
128
129 results = suggester.lookup(TestUtil.stringToCharSequence("barbar", random()), false, 2);
130 assertEquals(2, results.size());
131 assertEquals("barbar", results.get(0).key.toString());
132 assertEquals(12, results.get(0).value, 0.01F);
133 assertEquals("barbara", results.get(1).key.toString());
134 assertEquals(6, results.get(1).value, 0.01F);
135
136 results = suggester.lookup(TestUtil.stringToCharSequence("barbaa", random()), false, 2);
137 assertEquals(2, results.size());
138 assertEquals("barbar", results.get(0).key.toString());
139 assertEquals(12, results.get(0).value, 0.01F);
140 assertEquals("barbara", results.get(1).key.toString());
141 assertEquals(6, results.get(1).value, 0.01F);
142
143
144 results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
145 assertEquals(1, results.size());
146 assertEquals("foo", results.get(0).key.toString());
147 assertEquals(50, results.get(0).value, 0.01F);
148
149
150
151 results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
152 assertEquals(1, results.size());
153 assertEquals("bar", results.get(0).key.toString());
154 assertEquals(10, results.get(0).value, 0.01F);
155
156
157 results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
158 assertEquals(2, results.size());
159 assertEquals("barbar", results.get(0).key.toString());
160 assertEquals(12, results.get(0).value, 0.01F);
161 assertEquals("bar", results.get(1).key.toString());
162 assertEquals(10, results.get(1).value, 0.01F);
163
164
165 results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
166 assertEquals(3, results.size());
167 assertEquals("barbar", results.get(0).key.toString());
168 assertEquals(12, results.get(0).value, 0.01F);
169 assertEquals("bar", results.get(1).key.toString());
170 assertEquals(10, results.get(1).value, 0.01F);
171 assertEquals("barbara", results.get(2).key.toString());
172 assertEquals(6, results.get(2).value, 0.01F);
173
174 analyzer.close();
175 }
176
177
178
179
180 public void testStandard() throws Exception {
181 Input keys[] = new Input[] {
182 new Input("the ghost of christmas past", 50),
183 };
184
185 Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
186 FuzzySuggester suggester = new FuzzySuggester(standard, standard, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
187 FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
188 suggester.build(new InputArrayIterator(keys));
189
190 List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
191 assertEquals(1, results.size());
192 assertEquals("the ghost of christmas past", results.get(0).key.toString());
193 assertEquals(50, results.get(0).value, 0.01F);
194
195
196 results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
197 assertEquals(1, results.size());
198 assertEquals("the ghost of christmas past", results.get(0).key.toString());
199 assertEquals(50, results.get(0).value, 0.01F);
200
201
202 results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1);
203 assertEquals(1, results.size());
204 assertEquals("the ghost of christmas past", results.get(0).key.toString());
205 assertEquals(50, results.get(0).value, 0.01F);
206
207 standard.close();
208 }
209
210 public void testNoSeps() throws Exception {
211 Input[] keys = new Input[] {
212 new Input("ab cd", 0),
213 new Input("abcd", 1),
214 };
215
216 int options = 0;
217
218 Analyzer a = new MockAnalyzer(random());
219 FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, true, 1, true, 1, 3, false);
220 suggester.build(new InputArrayIterator(keys));
221
222
223
224
225 List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence("ab c", random()), false, 2);
226 assertEquals(2, r.size());
227
228
229
230
231 assertEquals("abcd", r.get(0).key.toString());
232 a.close();
233 }
234
235 public void testGraphDups() throws Exception {
236
237 final Analyzer analyzer = new Analyzer() {
238 @Override
239 protected TokenStreamComponents createComponents(String fieldName) {
240 Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
241
242 return new TokenStreamComponents(tokenizer) {
243 int tokenStreamCounter = 0;
244 final TokenStream[] tokenStreams = new TokenStream[] {
245 new CannedTokenStream(new Token[] {
246 token("wifi",1,1),
247 token("hotspot",0,2),
248 token("network",1,1),
249 token("is",1,1),
250 token("slow",1,1)
251 }),
252 new CannedTokenStream(new Token[] {
253 token("wi",1,1),
254 token("hotspot",0,3),
255 token("fi",1,1),
256 token("network",1,1),
257 token("is",1,1),
258 token("fast",1,1)
259
260 }),
261 new CannedTokenStream(new Token[] {
262 token("wifi",1,1),
263 token("hotspot",0,2),
264 token("network",1,1)
265 }),
266 };
267
268 @Override
269 public TokenStream getTokenStream() {
270 TokenStream result = tokenStreams[tokenStreamCounter];
271 tokenStreamCounter++;
272 return result;
273 }
274
275 @Override
276 protected void setReader(final Reader reader) {
277 }
278 };
279 }
280 };
281
282 Input keys[] = new Input[] {
283 new Input("wifi network is slow", 50),
284 new Input("wi fi network is fast", 10),
285 };
286 FuzzySuggester suggester = new FuzzySuggester(analyzer);
287 suggester.build(new InputArrayIterator(keys));
288
289 List<LookupResult> results = suggester.lookup("wifi network", false, 10);
290 if (VERBOSE) {
291 System.out.println("Results: " + results);
292 }
293 assertEquals(2, results.size());
294 assertEquals("wifi network is slow", results.get(0).key);
295 assertEquals(50, results.get(0).value);
296 assertEquals("wi fi network is fast", results.get(1).key);
297 assertEquals(10, results.get(1).value);
298 analyzer.close();
299 }
300
301 public void testEmpty() throws Exception {
302 Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
303 FuzzySuggester suggester = new FuzzySuggester(analyzer);
304 suggester.build(new InputArrayIterator(new Input[0]));
305
306 List<LookupResult> result = suggester.lookup("a", false, 20);
307 assertTrue(result.isEmpty());
308 analyzer.close();
309 }
310
311 public void testInputPathRequired() throws Exception {
312
313
314
315
316
317
318
319
320
321 final Analyzer analyzer = new Analyzer() {
322 @Override
323 protected TokenStreamComponents createComponents(String fieldName) {
324 Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
325
326 return new TokenStreamComponents(tokenizer) {
327 int tokenStreamCounter = 0;
328 final TokenStream[] tokenStreams = new TokenStream[] {
329 new CannedTokenStream(new Token[] {
330 token("ab",1,1),
331 token("ba",0,1),
332 token("xc",1,1)
333 }),
334 new CannedTokenStream(new Token[] {
335 token("ba",1,1),
336 token("xd",1,1)
337 }),
338 new CannedTokenStream(new Token[] {
339 token("ab",1,1),
340 token("ba",0,1),
341 token("x",1,1)
342 })
343 };
344
345 @Override
346 public TokenStream getTokenStream() {
347 TokenStream result = tokenStreams[tokenStreamCounter];
348 tokenStreamCounter++;
349 return result;
350 }
351
352 @Override
353 protected void setReader(final Reader reader) {
354 }
355 };
356 }
357 };
358
359 Input keys[] = new Input[] {
360 new Input("ab xc", 50),
361 new Input("ba xd", 50),
362 };
363 FuzzySuggester suggester = new FuzzySuggester(analyzer);
364 suggester.build(new InputArrayIterator(keys));
365 List<LookupResult> results = suggester.lookup("ab x", false, 1);
366 assertTrue(results.size() == 1);
367 analyzer.close();
368 }
369
370 private static Token token(String term, int posInc, int posLength) {
371 final Token t = new Token(term, 0, 0);
372 t.setPositionIncrement(posInc);
373 t.setPositionLength(posLength);
374 return t;
375 }
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395 private final Analyzer getUnusualAnalyzer() {
396 return new Analyzer() {
397 @Override
398 protected TokenStreamComponents createComponents(String fieldName) {
399 Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
400
401 return new TokenStreamComponents(tokenizer) {
402
403 int count;
404
405 @Override
406 public TokenStream getTokenStream() {
407
408
409 if (count++ != 3) {
410 return new CannedTokenStream(new Token[] {
411 token("a", 1, 1),
412 });
413 } else {
414
415 return new CannedTokenStream(new Token[] {
416 token("a", 1, 1),
417 token("b", 1, 1),
418 });
419 }
420 }
421
422 @Override
423 protected void setReader(final Reader reader) {
424 }
425 };
426 }
427 };
428 }
429
430 public void testExactFirst() throws Exception {
431
432 Analyzer a = getUnusualAnalyzer();
433 FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, true, 1, true, 1, 3, false);
434 suggester.build(new InputArrayIterator(new Input[] {
435 new Input("x y", 1),
436 new Input("x y z", 3),
437 new Input("x", 2),
438 new Input("z z z", 20),
439 }));
440
441
442
443 for(int topN=1;topN<6;topN++) {
444 List<LookupResult> results = suggester.lookup("x y", false, topN);
445
446
447 assertEquals(Math.min(topN, 4), results.size());
448
449 assertEquals("x y", results.get(0).key);
450 assertEquals(1, results.get(0).value);
451
452 if (topN > 1) {
453 assertEquals("z z z", results.get(1).key);
454 assertEquals(20, results.get(1).value);
455
456 if (topN > 2) {
457 assertEquals("x y z", results.get(2).key);
458 assertEquals(3, results.get(2).value);
459
460 if (topN > 3) {
461 assertEquals("x", results.get(3).key);
462 assertEquals(2, results.get(3).value);
463 }
464 }
465 }
466 }
467 a.close();
468 }
469
470 public void testNonExactFirst() throws Exception {
471
472 Analyzer a = getUnusualAnalyzer();
473 FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, true, 1, true, 1, 3, false);
474
475 suggester.build(new InputArrayIterator(new Input[] {
476 new Input("x y", 1),
477 new Input("x y z", 3),
478 new Input("x", 2),
479 new Input("z z z", 20),
480 }));
481
482 for(int topN=1;topN<6;topN++) {
483 List<LookupResult> results = suggester.lookup("p", false, topN);
484
485 assertEquals(Math.min(topN, 4), results.size());
486
487 assertEquals("z z z", results.get(0).key);
488 assertEquals(20, results.get(0).value);
489
490 if (topN > 1) {
491 assertEquals("x y z", results.get(1).key);
492 assertEquals(3, results.get(1).value);
493
494 if (topN > 2) {
495 assertEquals("x", results.get(2).key);
496 assertEquals(2, results.get(2).value);
497
498 if (topN > 3) {
499 assertEquals("x y", results.get(3).key);
500 assertEquals(1, results.get(3).value);
501 }
502 }
503 }
504 }
505 a.close();
506 }
507
508
509 private static class TermFreqPayload2 implements Comparable<TermFreqPayload2> {
510 public final String surfaceForm;
511 public final String analyzedForm;
512 public final long weight;
513
514 public TermFreqPayload2(String surfaceForm, String analyzedForm, long weight) {
515 this.surfaceForm = surfaceForm;
516 this.analyzedForm = analyzedForm;
517 this.weight = weight;
518 }
519
520 @Override
521 public int compareTo(TermFreqPayload2 other) {
522 int cmp = analyzedForm.compareTo(other.analyzedForm);
523 if (cmp != 0) {
524 return cmp;
525 } else if (weight > other.weight) {
526 return -1;
527 } else if (weight < other.weight) {
528 return 1;
529 } else {
530 assert false;
531 return 0;
532 }
533 }
534 }
535
536 static boolean isStopChar(char ch, int numStopChars) {
537
538 return (ch - 'a') < numStopChars;
539 }
540
541
542 private static class TokenEater extends TokenFilter {
543 private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
544 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
545 private final int numStopChars;
546 private final boolean preserveHoles;
547 private boolean first;
548
549 public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {
550 super(in);
551 this.preserveHoles = preserveHoles;
552 this.numStopChars = numStopChars;
553 }
554
555 @Override
556 public void reset() throws IOException {
557 super.reset();
558 first = true;
559 }
560
561 @Override
562 public final boolean incrementToken() throws IOException {
563 int skippedPositions = 0;
564 while (input.incrementToken()) {
565 if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {
566 int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;
567 if (first) {
568 if (posInc == 0) {
569
570 posInc = 1;
571 }
572 first = false;
573 }
574 posIncrAtt.setPositionIncrement(posInc);
575
576 return true;
577 }
578 if (preserveHoles) {
579 skippedPositions += posIncrAtt.getPositionIncrement();
580 }
581 }
582
583 return false;
584 }
585 }
586
587 private static class MockTokenEatingAnalyzer extends Analyzer {
588 private int numStopChars;
589 private boolean preserveHoles;
590
591 public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {
592 this.preserveHoles = preserveHoles;
593 this.numStopChars = numStopChars;
594 }
595
596 @Override
597 public TokenStreamComponents createComponents(String fieldName) {
598 MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
599 tokenizer.setEnableChecks(true);
600 TokenStream next;
601 if (numStopChars != 0) {
602 next = new TokenEater(preserveHoles, tokenizer, numStopChars);
603 } else {
604 next = tokenizer;
605 }
606 return new TokenStreamComponents(tokenizer, next);
607 }
608 }
609
610 public void testRandom() throws Exception {
611
612 int numQueries = atLeast(100);
613
614 final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
615 final TreeSet<String> allPrefixes = new TreeSet<>();
616 final Set<String> seen = new HashSet<>();
617
618 Input[] keys = new Input[numQueries];
619
620 boolean preserveSep = random().nextBoolean();
621 boolean unicodeAware = random().nextBoolean();
622
623 final int numStopChars = random().nextInt(10);
624 final boolean preserveHoles = random().nextBoolean();
625
626 if (VERBOSE) {
627 System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
628 }
629
630 for (int i = 0; i < numQueries; i++) {
631 int numTokens = TestUtil.nextInt(random(), 1, 4);
632 String key;
633 String analyzedKey;
634 while(true) {
635 key = "";
636 analyzedKey = "";
637 boolean lastRemoved = false;
638 for(int token=0;token < numTokens;token++) {
639 String s;
640 while (true) {
641
642
643 s = TestUtil.randomSimpleString(random());
644 if (s.length() > 0) {
645 if (token > 0) {
646 key += " ";
647 }
648 if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length())-1) != ' ' : analyzedKey.charAt(analyzedKey.length()-1) != ' ')) {
649 analyzedKey += " ";
650 }
651 key += s;
652 if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
653 if (preserveSep && preserveHoles) {
654 analyzedKey += '\u0000';
655 }
656 lastRemoved = true;
657 } else {
658 analyzedKey += s;
659 lastRemoved = false;
660 }
661 break;
662 }
663 }
664 }
665
666 analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
667
668 if (preserveSep && lastRemoved) {
669 analyzedKey += " ";
670 }
671
672
673 if (!seen.contains(key)) {
674 seen.add(key);
675 break;
676 }
677 }
678
679 for (int j = 1; j < key.length(); j++) {
680 allPrefixes.add(key.substring(0, j));
681 }
682
683 int weight = random().nextInt(1<<24);
684 keys[i] = new Input(key, weight);
685
686 slowCompletor.add(new TermFreqPayload2(key, analyzedKey, weight));
687 }
688
689 if (VERBOSE) {
690
691
692 List<TermFreqPayload2> sorted = new ArrayList<>(slowCompletor);
693 Collections.sort(sorted);
694 for(TermFreqPayload2 ent : sorted) {
695 System.out.println(" surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
696 }
697 }
698
699 Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
700 FuzzySuggester suggester = new FuzzySuggester(a, a,
701 preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true, 1, false, 1, 3, unicodeAware);
702 suggester.build(new InputArrayIterator(keys));
703
704 for (String prefix : allPrefixes) {
705
706 if (VERBOSE) {
707 System.out.println("\nTEST: prefix=" + prefix);
708 }
709
710 final int topN = TestUtil.nextInt(random(), 1, 10);
711 List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
712
713
714 List<LookupResult> matches = new ArrayList<>();
715
716
717 String[] tokens = prefix.split(" ");
718 StringBuilder builder = new StringBuilder();
719 boolean lastRemoved = false;
720 for(int i=0;i<tokens.length;i++) {
721 String token = tokens[i];
722 if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
723 builder.append(' ');
724 }
725
726 if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
727 if (preserveSep && preserveHoles) {
728 builder.append("\u0000");
729 }
730 lastRemoved = true;
731 } else {
732 builder.append(token);
733 lastRemoved = false;
734 }
735 }
736
737 String analyzedKey = builder.toString();
738
739
740
741
742 while (true) {
743 String s = analyzedKey.replaceAll("(^| )\u0000$", "");
744 s = s.replaceAll("\\s+$", "");
745 if (s.equals(analyzedKey)) {
746 break;
747 }
748 analyzedKey = s;
749 }
750
751 if (analyzedKey.length() == 0) {
752
753
754 continue;
755 }
756
757 if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
758 analyzedKey += " ";
759 }
760
761 if (VERBOSE) {
762 System.out.println(" analyzed: " + analyzedKey);
763 }
764 TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
765
766
767
768
769
770 Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
771 assertTrue(automaton.isDeterministic());
772
773
774 BytesRefBuilder spare = new BytesRefBuilder();
775 for (TermFreqPayload2 e : slowCompletor) {
776 spare.copyChars(e.analyzedForm);
777 FiniteStringsIterator finiteStrings =
778 new FiniteStringsIterator(suggester.toAutomaton(spare.get(), tokenStreamToAutomaton));
779 for (IntsRef string; (string = finiteStrings.next()) != null;) {
780 int p = 0;
781 BytesRef ref = Util.toBytesRef(string, spare);
782 boolean added = false;
783 for (int i = ref.offset; i < ref.length; i++) {
784 int q = automaton.step(p, ref.bytes[i] & 0xff);
785 if (q == -1) {
786 break;
787 } else if (automaton.isAccept(q)) {
788 matches.add(new LookupResult(e.surfaceForm, e.weight));
789 added = true;
790 break;
791 }
792 p = q;
793 }
794 if (!added && automaton.isAccept(p)) {
795 matches.add(new LookupResult(e.surfaceForm, e.weight));
796 }
797 }
798 }
799
800 assertTrue(numStopChars > 0 || matches.size() > 0);
801
802 if (matches.size() > 1) {
803 Collections.sort(matches, new Comparator<LookupResult>() {
804 @Override
805 public int compare(LookupResult left, LookupResult right) {
806 int cmp = Float.compare(right.value, left.value);
807 if (cmp == 0) {
808 return left.compareTo(right);
809 } else {
810 return cmp;
811 }
812 }
813 });
814 }
815
816 if (matches.size() > topN) {
817 matches = matches.subList(0, topN);
818 }
819
820 if (VERBOSE) {
821 System.out.println(" expected:");
822 for(LookupResult lr : matches) {
823 System.out.println(" key=" + lr.key + " weight=" + lr.value);
824 }
825
826 System.out.println(" actual:");
827 for(LookupResult lr : r) {
828 System.out.println(" key=" + lr.key + " weight=" + lr.value);
829 }
830 }
831
832 assertEquals(prefix + " " + topN, matches.size(), r.size());
833 for(int hit=0;hit<r.size();hit++) {
834
835 assertEquals(prefix + " " + topN, matches.get(hit).key.toString(), r.get(hit).key.toString());
836 assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
837 }
838 }
839 a.close();
840 }
841
842 public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
843 Analyzer a = new MockAnalyzer(random());
844 FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, true, 1, true, 1, 3, false);
845
846 List<Input> keys = Arrays.asList(new Input[] {
847 new Input("a", 40),
848 new Input("a ", 50),
849 new Input(" a", 60),
850 });
851
852 Collections.shuffle(keys, random());
853 suggester.build(new InputArrayIterator(keys));
854
855 List<LookupResult> results = suggester.lookup("a", false, 5);
856 assertEquals(2, results.size());
857 assertEquals(" a", results.get(0).key);
858 assertEquals(60, results.get(0).value);
859 assertEquals("a ", results.get(1).key);
860 assertEquals(50, results.get(1).value);
861 a.close();
862 }
863
864 public void testEditSeps() throws Exception {
865 Analyzer a = new MockAnalyzer(random());
866 FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, true, 2, true, 1, 3, false);
867
868 List<Input> keys = Arrays.asList(new Input[] {
869 new Input("foo bar", 40),
870 new Input("foo bar baz", 50),
871 new Input("barbaz", 60),
872 new Input("barbazfoo", 10),
873 });
874
875 Collections.shuffle(keys, random());
876 suggester.build(new InputArrayIterator(keys));
877
878 assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString());
879 assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString());
880 assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString());
881 assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString());
882 a.close();
883 }
884
885 @SuppressWarnings("fallthrough")
886 private static String addRandomEdit(String string, int prefixLength) {
887 char[] input = string.toCharArray();
888 StringBuilder builder = new StringBuilder();
889 for (int i = 0; i < input.length; i++) {
890 if (i >= prefixLength && random().nextBoolean() && i < input.length-1) {
891 switch(random().nextInt(4)) {
892 case 3:
893 if (i < input.length-1) {
894
895 builder.append(input[i+1]);
896 builder.append(input[i]);
897 for(int j=i+2;j<input.length;j++) {
898 builder.append(input[j]);
899 }
900 return builder.toString();
901 }
902
903 case 2:
904
905 for (int j = i+1; j < input.length; j++) {
906 builder.append(input[j]);
907 }
908 return builder.toString();
909 case 1:
910
911 if (i+1<input.length) {
912 builder.append(input[i+1]);
913 builder.append(input[i++]);
914 i++;
915 }
916 for (int j = i; j < input.length; j++) {
917 builder.append(input[j]);
918 }
919 return builder.toString();
920 case 0:
921
922
923
924
925
926 int x = random().nextBoolean() ? random().nextInt(30) : 32 + random().nextInt(128 - 32);
927 builder.append((char) x);
928 for (int j = i; j < input.length; j++) {
929 builder.append(input[j]);
930 }
931 return builder.toString();
932 }
933 }
934
935 builder.append(input[i]);
936 }
937
938 return builder.toString();
939 }
940
941 private String randomSimpleString(int maxLen) {
942 final int len = TestUtil.nextInt(random(), 1, maxLen);
943 final char[] chars = new char[len];
944 for(int j=0;j<len;j++) {
945 chars[j] = (char) ('a' + random().nextInt(4));
946 }
947 return new String(chars);
948 }
949
950 public void testRandom2() throws Throwable {
951 final int NUM = atLeast(200);
952 final List<Input> answers = new ArrayList<>();
953 final Set<String> seen = new HashSet<>();
954 for(int i=0;i<NUM;i++) {
955 final String s = randomSimpleString(8);
956 if (!seen.contains(s)) {
957 answers.add(new Input(s, random().nextInt(1000)));
958 seen.add(s);
959 }
960 }
961
962 Collections.sort(answers, new Comparator<Input>() {
963 @Override
964 public int compare(Input a, Input b) {
965 return a.term.compareTo(b.term);
966 }
967 });
968 if (VERBOSE) {
969 System.out.println("\nTEST: targets");
970 for(Input tf : answers) {
971 System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v);
972 }
973 }
974
975 Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
976 int maxEdits = random().nextBoolean() ? 1 : 2;
977 int prefixLen = random().nextInt(4);
978 boolean transpositions = random().nextBoolean();
979
980
981 FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, true, maxEdits, transpositions, prefixLen, prefixLen, false);
982
983 if (VERBOSE) {
984 System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
985 }
986
987 Collections.shuffle(answers, random());
988 suggest.build(new InputArrayIterator(answers.toArray(new Input[answers.size()])));
989
990 final int ITERS = atLeast(100);
991 for(int iter=0;iter<ITERS;iter++) {
992 final String frag = randomSimpleString(6);
993 if (VERBOSE) {
994 System.out.println("\nTEST: iter frag=" + frag);
995 }
996 final List<LookupResult> expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);
997 if (VERBOSE) {
998 System.out.println(" expected: " + expected.size());
999 for(LookupResult c : expected) {
1000 System.out.println(" " + c);
1001 }
1002 }
1003 final List<LookupResult> actual = suggest.lookup(frag, false, NUM);
1004 if (VERBOSE) {
1005 System.out.println(" actual: " + actual.size());
1006 for(LookupResult c : actual) {
1007 System.out.println(" " + c);
1008 }
1009 }
1010
1011 Collections.sort(actual, new CompareByCostThenAlpha());
1012
1013 final int limit = Math.min(expected.size(), actual.size());
1014 for(int ans=0;ans<limit;ans++) {
1015 final LookupResult c0 = expected.get(ans);
1016 final LookupResult c1 = actual.get(ans);
1017 assertEquals("expected " + c0.key +
1018 " but got " + c1.key,
1019 0,
1020 CHARSEQUENCE_COMPARATOR.compare(c0.key, c1.key));
1021 assertEquals(c0.value, c1.value);
1022 }
1023 assertEquals(expected.size(), actual.size());
1024 }
1025 a.close();
1026 }
1027
1028 private List<LookupResult> slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List<Input> answers, String frag) {
1029 final List<LookupResult> results = new ArrayList<>();
1030 final int fragLen = frag.length();
1031 for(Input tf : answers) {
1032
1033 boolean prefixMatches = true;
1034 for(int i=0;i<prefixLen;i++) {
1035 if (i == fragLen) {
1036
1037 break;
1038 }
1039 if (i == tf.term.length || tf.term.bytes[i] != (byte) frag.charAt(i)) {
1040 prefixMatches = false;
1041 break;
1042 }
1043 }
1044
1045
1046 if (prefixMatches) {
1047 final int len = tf.term.length;
1048 if (len >= fragLen-maxEdits) {
1049
1050
1051 int d;
1052 final String s = tf.term.utf8ToString();
1053 if (fragLen == prefixLen) {
1054 d = 0;
1055 } else if (false && len < fragLen) {
1056 d = getDistance(frag, s, allowTransposition);
1057 } else {
1058
1059 d = maxEdits + 1;
1060
1061 for(int ed=-maxEdits;ed<=maxEdits;ed++) {
1062 if (s.length() < fragLen - ed) {
1063 continue;
1064 }
1065 String check = s.substring(0, fragLen-ed);
1066 d = getDistance(frag, check, allowTransposition);
1067
1068 if (d <= maxEdits) {
1069 break;
1070 }
1071 }
1072 }
1073 if (d <= maxEdits) {
1074 results.add(new LookupResult(tf.term.utf8ToString(), tf.v));
1075 }
1076 }
1077 }
1078
1079 Collections.sort(results, new CompareByCostThenAlpha());
1080 }
1081
1082 return results;
1083 }
1084
1085 private static class CharSequenceComparator implements Comparator<CharSequence> {
1086
1087 @Override
1088 public int compare(CharSequence o1, CharSequence o2) {
1089 final int l1 = o1.length();
1090 final int l2 = o2.length();
1091
1092 final int aStop = Math.min(l1, l2);
1093 for (int i = 0; i < aStop; i++) {
1094 int diff = o1.charAt(i) - o2.charAt(i);
1095 if (diff != 0) {
1096 return diff;
1097 }
1098 }
1099
1100 return l1 - l2;
1101 }
1102 }
1103
1104 private static final Comparator<CharSequence> CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();
1105
1106 public class CompareByCostThenAlpha implements Comparator<LookupResult> {
1107 @Override
1108 public int compare(LookupResult a, LookupResult b) {
1109 if (a.value > b.value) {
1110 return -1;
1111 } else if (a.value < b.value) {
1112 return 1;
1113 } else {
1114 final int c = CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
1115 assert c != 0: "term=" + a.key;
1116 return c;
1117 }
1118 }
1119 }
1120
1121
1122
1123
1124
1125
1126
1127
1128 public int getDistance(String target, String other, boolean allowTransposition) {
1129 IntsRef targetPoints;
1130 IntsRef otherPoints;
1131 int n;
1132 int d[][];
1133
1134
1135
1136
1137
1138
1139
1140
1141 targetPoints = toIntsRef(target);
1142 otherPoints = toIntsRef(other);
1143 n = targetPoints.length;
1144 final int m = otherPoints.length;
1145 d = new int[n+1][m+1];
1146
1147 if (n == 0 || m == 0) {
1148 if (n == m) {
1149 return 0;
1150 }
1151 else {
1152 return Math.max(n, m);
1153 }
1154 }
1155
1156
1157 int i;
1158 int j;
1159
1160 int t_j;
1161
1162 int cost;
1163
1164 for (i = 0; i<=n; i++) {
1165 d[i][0] = i;
1166 }
1167
1168 for (j = 0; j<=m; j++) {
1169 d[0][j] = j;
1170 }
1171
1172 for (j = 1; j<=m; j++) {
1173 t_j = otherPoints.ints[j-1];
1174
1175 for (i=1; i<=n; i++) {
1176 cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
1177
1178 d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
1179
1180 if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
1181 d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
1182 }
1183 }
1184 }
1185
1186 return d[n][m];
1187 }
1188
1189 private static IntsRef toIntsRef(String s) {
1190 IntsRef ref = new IntsRef(s.length());
1191 int utf16Len = s.length();
1192 for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
1193 cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
1194 }
1195 return ref;
1196 }
1197 }